# Subselect column after download -> TODO remove
#for (city in CITIES) {
# tmp_df = read.csv(glue::glue("data/airbnb/{city}.csv"))
# write.csv(tmp_df %>% select(DATASET_COLUMNS), glue::glue("data/airbnb/{city}.csv"), row.names=FALSE)
#}
# Define column groups
df_columns = colnames(df)
host_columns = df_columns[grepl('Host', df_columns)]
review_columns = df_columns[grepl('Review', df_columns)]
price_columns = c('Price', 'Security.Deposit', 'Cleaning.Fee', 'Cancellation.Policy')
services_columns = c('Instant_Bookable', 'Internet', 'Kitchen', 'Washer', 'Breakfast', 'Air_conditioning', 'Experiences.Offered')
accomodation_columns = c('Property.Type', 'Room.Type', 'Accommodates', 'Bathrooms', 'Bedrooms', 'Beds', 'Bed.Type', 'Minimum.Nights')
summary(df[, c('City')])
## City
## Amsterdam:3351
## Barcelona:3678
## Berlin :2802
## London :2816
## Roma :3350
## Wien :3271
summary(df[, host_columns])
## Host.Since Host.Response.Time Host.Response.Rate Host.ProfilePic
## Min. :1529 a few days or more: 501 Min. : 0.0 Mode :logical
## 1st Qu.:2057 within a day : 3689 1st Qu.: 99.0 FALSE:36
## Median :2463 within a few hours: 4793 Median :100.0 TRUE :19232
## Mean :2532 within an hour :10285 Mean : 94.2
## 3rd Qu.:2960 3rd Qu.:100.0
## Max. :4709 Max. :100.0
## Host.SuperHost Host.verified
## Mode :logical Mode :logical
## FALSE:16610 FALSE:7471
## TRUE :2658 TRUE :11797
##
##
##
summary(df[, review_columns])
## Number.of.Reviews Review.Scores.Rating Review.Scores.Accuracy
## Min. : 1.00 Min. : 20.00 Min. : 2.000
## 1st Qu.: 4.00 1st Qu.: 90.00 1st Qu.: 9.000
## Median : 11.00 Median : 95.00 Median :10.000
## Mean : 24.26 Mean : 92.59 Mean : 9.506
## 3rd Qu.: 29.00 3rd Qu.: 98.00 3rd Qu.:10.000
## Max. :417.00 Max. :100.00 Max. :10.000
## Review.Scores.Cleanliness Review.Scores.Checkin Review.Scores.Communication
## Min. : 2.000 Min. : 2.000 Min. : 2.000
## 1st Qu.: 9.000 1st Qu.: 9.000 1st Qu.:10.000
## Median :10.000 Median :10.000 Median :10.000
## Mean : 9.367 Mean : 9.667 Mean : 9.662
## 3rd Qu.:10.000 3rd Qu.:10.000 3rd Qu.:10.000
## Max. :10.000 Max. :10.000 Max. :10.000
## Review.Scores.Location Review.Scores.Value
## Min. : 2.000 Min. : 2.000
## 1st Qu.: 9.000 1st Qu.: 9.000
## Median :10.000 Median : 9.000
## Mean : 9.406 Mean : 9.264
## 3rd Qu.:10.000 3rd Qu.:10.000
## Max. :10.000 Max. :10.000
summary(df[, price_columns])
## Price Security.Deposit Cleaning.Fee Cancellation.Policy
## Min. : 9.00 Min. : 0.00 Min. : 0.00 strict :6838
## 1st Qu.: 45.00 1st Qu.: 0.00 1st Qu.: 0.00 moderate :5228
## Median : 70.00 Median : 0.00 Median : 20.00 flexible :3843
## Mean : 89.87 Mean : 93.91 Mean : 24.64 moderate_new:1438
## 3rd Qu.:110.00 3rd Qu.:150.00 3rd Qu.: 40.00 strict_new :1038
## Max. :999.00 Max. :999.00 Max. :375.00 flexible_new: 867
## (Other) : 16
summary(df[, services_columns])
## Instant_Bookable Internet Kitchen Washer
## Mode :logical Mode:logical Mode :logical Mode :logical
## FALSE:13234 TRUE:19268 FALSE:1680 FALSE:4729
## TRUE :6034 TRUE :17588 TRUE :14539
##
##
## Breakfast Air_conditioning Experiences.Offered
## Mode :logical Mode :logical business: 54
## FALSE:16957 FALSE:14304 family : 53
## TRUE :2311 TRUE :4964 none :19095
## romantic: 26
## social : 40
summary(df[, accomodation_columns])
## Property.Type Room.Type Accommodates
## Apartment :16071 Entire home/apt:12119 Min. : 1.000
## House : 1324 Private room : 6972 1st Qu.: 2.000
## Bed & Breakfast: 810 Shared room : 177 Median : 3.000
## Condominium : 298 Mean : 3.312
## Loft : 231 3rd Qu.: 4.000
## Boat : 135 Max. :16.000
## (Other) : 399
## Bathrooms Bedrooms Beds Bed.Type
## Min. :0.000 Min. : 0.000 Min. : 1.000 Airbed : 16
## 1st Qu.:1.000 1st Qu.: 1.000 1st Qu.: 1.000 Couch : 46
## Median :1.000 Median : 1.000 Median : 2.000 Futon : 115
## Mean :1.208 Mean : 1.368 Mean : 2.039 Pull-out Sofa: 336
## 3rd Qu.:1.000 3rd Qu.: 2.000 3rd Qu.: 2.000 Real Bed :18755
## Max. :8.000 Max. :10.000 Max. :16.000
##
## Minimum.Nights
## Min. : 1.000
## 1st Qu.: 1.000
## Median : 2.000
## Mean : 2.865
## 3rd Qu.: 3.000
## Max. :200.000
##
# Aggregate data to do some plotting
agg_df = df %>%
select(City, Latitude, Longitude, Price) %>%
group_by(City) %>%
summarise(across(everything(), list(mean))) %>%
mutate(lat = Latitude_1,
long = Longitude_1,
price = Price_1)
pal <- colorNumeric(
palette = "viridis",
domain = df$price)
## Warning: Unknown or uninitialised column: `price`.
labs <- lapply(seq(nrow(agg_df)), function(i) {
paste0( '<p>', '<b>City:</b> ', agg_df[i, "City"][[1]], '<p></p>',
'<b>Avg Price per night:</b> ', round(agg_df[i, "price"], 2), '</p>' )
})
m<-leaflet(agg_df) %>% addProviderTiles('CartoDB.Positron') %>%
addCircleMarkers(lng= ~long, lat= ~lat, color= ~pal(price), radius = ~price/5, label = lapply(labs, htmltools::HTML)) %>%
addLegend(pal = pal, values = ~price, title = "Average price",
labFormat = labelFormat(suffix = "€"), opacity = 1)
fig = plot_ly(df, y = ~Price, color = ~City, type = "box")
p = bscols(m, fig)
p
atmp_df = df %>% filter(City == 'Amsterdam', Price < 250) %>% sample_n(1000)
batmp_df = df %>% filter(City == 'Barcelona', Price < 250) %>% sample_n(1000)
betmp_df = df %>% filter(City == 'Berlin', Price < 250) %>% sample_n(1000)
ltmp_df = df %>% filter(City == 'London', Price < 250) %>% sample_n(1000)
rtmp_df = df %>% filter(City == 'Roma', Price < 250) %>% sample_n(1000)
wtmp_df = df %>% filter(City == 'Wien', Price < 250) %>% sample_n(1000)
labs <- lapply(seq(nrow(atmp_df)), function(i) {
paste0( '<p>', '<b>City:</b> ', atmp_df[i, "City"][[1]], '<p></p>',
'<b>Avg Price per night:</b> ', round(atmp_df[i, "Price"], 2), '</p>',
'<b>Dist from center:</b> ', round(atmp_df[i, "dist_from_city_center"], 2), '</p>')
})
ma<-leaflet(atmp_df) %>% addProviderTiles('CartoDB.Positron') %>%
addCircleMarkers(lng= ~Longitude, lat= ~Latitude, color= ~pal(Price), radius=1, label = lapply(labs, htmltools::HTML)) %>%
addLegend(pal = pal, values = ~Price, title = "Average price",
labFormat = labelFormat(suffix = "€"))
mba<-leaflet(batmp_df) %>% addProviderTiles('CartoDB.Positron') %>%
addCircleMarkers(lng= ~Longitude, lat= ~Latitude, color= ~pal(Price), radius=1, label=~Price) %>%
addLegend(pal = pal, values = ~Price, title = "Average price",
labFormat = labelFormat(suffix = "€"))
p1 = bscols(ma, mba)
mbe<-leaflet(betmp_df) %>% addProviderTiles('CartoDB.Positron') %>%
addCircleMarkers(lng= ~Longitude, lat= ~Latitude, color= ~pal(Price), radius=1, label=~Price) %>%
addLegend(pal = pal, values = ~Price, title = "Average price",
labFormat = labelFormat(suffix = "€"))
ml<-leaflet(ltmp_df) %>% addProviderTiles('CartoDB.Positron') %>%
addCircleMarkers(lng= ~Longitude, lat= ~Latitude, color= ~pal(Price), radius=1, label=~Price) %>%
addLegend(pal = pal, values = ~Price, title = "Average price",
labFormat = labelFormat(suffix = "€"))
p2 = bscols(mbe, ml)
mr<-leaflet(rtmp_df) %>% addProviderTiles('CartoDB.Positron') %>%
addCircleMarkers(lng= ~Longitude, lat= ~Latitude, color= ~pal(Price), radius=1, label=~Price) %>%
addLegend(pal = pal, values = ~Price, title = "Average price",
labFormat = labelFormat(suffix = "€"))
mw<-leaflet(wtmp_df) %>% addProviderTiles('CartoDB.Positron') %>%
addCircleMarkers(lng= ~Longitude, lat= ~Latitude, color= ~pal(Price), radius=1, label=~Price) %>%
addLegend(pal = pal, values = ~Price, title = "Average price",
labFormat = labelFormat(suffix = "€"))
p3 = bscols(mr, mw)
p1
p2
p3
TODO - Compute distance_from_center column - show better stats/graphs on group of columns (as did above) - compute correlation matrix (qualitative and quantitative) to show what drives prices (and if it differs by city) - Exclude useless columns by correlation above or PCA - try clustering by city and clustering the entire dataset to show if changes - try and predict a price of something - evaluate model - Try and predict the city of a room based on other charateristics - try clusters with and w/out prices and see if prices variance is small in the clusters - Try and predict the review scores -